*******************************************************************************
* this do-file: 1) cleans the data used in Kern and Hainmueller (2009, PA);
*               2) replicates KH (2009, PA);
*               3) generates the cleaned data set;
*               4) generates extension results for KH (2009, PA).
* data used in this file: za6008.dta
* data generated by this file: kh_2009pa_cleaned.dta
* date: June 14, 2021
*******************************************************************************

* log file
* close any log left open by an earlier (possibly aborted) run; -capture-
* swallows the error that -log close- raises when no log is open
capture log close
* -replace- overwrites the log written by a previous run, so the do-file can
* be re-executed without stopping on "file already exists"
log using "C:\Users\arthu\Dropbox\research\mte_at\codes\replication\results\kh_2009pa_clean.log", replace

* global command
* folder that holds the raw input data and receives the cleaned data set
global data "C:\Users\arthu\Dropbox\research\mte_at\codes\replication\data"

*******************************************************************************
* prepare: data cleaning, gen var, and gen data
*******************************************************************************

* load the raw survey file, replacing whatever is currently in memory
use "$data\za6008.dta", clear

*************************
* inspect var: instrument
*************************

// district codes:
// 1 = schwerin, 2 = magdeburg, 3 = berlin, 4 = cottbus, 5 = leipzig
// 6 = dresden, 7 = karl-marx-stadt, 8 = erfurt
tabulate u88_0600
label variable u88_0600 "investigation areas"

************************
* inspect var: treatment
************************

// viewing frequency codes:
// 1 = täglich = daily, 2 = mehrmals wöchentlich = several times a week
// 3 = einmal wöchentlich = once a week, 4 = seltener = less often
// 5 = überhaupt nicht = not at all
tabulate u88_0079
label variable u88_0079 "media reception - television of the brd"

**********************
* inspect var: outcome
**********************

// agreement codes shared by all three outcome items:
// 1 = vollkommen = completely, 2 = mit einschränkungen = with restrictions
// 3 = kaum = barely, 4 = überhaupt nicht = not at all
tabulate u88_0037
label variable u88_0037 "convinced of Leninist/Marxist worldview"

// same coding as u88_0037
tabulate u88_0039
label variable u88_0039 "feel closely attached to East Germany"

// same coding as u88_0037
tabulate u88_0040
label variable u88_0040 "poli power is exercised in ways consistent with my views"

*********************************************
* check var: limited set of control variables
*********************************************

// respondent age: copy the raw item into a readable name
tabulate u88_0085
label variable u88_0085 "age"
generate age = u88_0085
label variable age "age"

// narrow age cohorts:
// 1 = up to 20, 2 = above 20 up to 25, 3 = above 25 up to 30,
// 4 = above 30 up to 35, 5 = above 35 up to 40, 6 = above 40 and below 51
// ages of 51 and above, and missing ages, are left missing
// irecode() returns the 0-based index of the interval containing age,
// hence the + 1
// NOTE(review): cohort 6 is wider than the "5 years" label suggests -
// confirm this is intended
generate age_bin_5 = irecode(age, 20, 25, 30, 35, 40) + 1 if age < 51
label variable age_bin_5 "age cohort: 5 years"
tabulate age_bin_5

// wide age cohorts:
// 1 = up to 20, 2 = above 20 up to 30, 3 = above 30 up to 40,
// 4 = above 40 and below 51; everything else is left missing
generate age_bin_10 = irecode(age, 20, 30, 40) + 1 if age < 51
label variable age_bin_10 "age cohort: 10 years"

// gender: 1 = männlich = male, 2 = weiblich = female (raw coding is kept)
tabulate u88_0086
label variable u88_0086 "gender"
generate gender = u88_0086
label variable gender "gender"

// father's occupational qualification (raw coding is kept):
// 1 = ohne berufsabschluss = no professional qualification,
// 2 = teilfacharbeiter = partially skilled worker,
// 3 = facharbeiterabschluss = skilled worker qualification,
// 4 = meister abschluß = master craftsman certificate,
// 5 = fachschule abschluß = technical school graduation,
// 6 = hochschul/uniabschl = university degree
tabulate u88_0095
label variable u88_0095 "occupational classification for father"
generate father_occ = u88_0095
label variable father_occ "occupational classification for father"

// mother's occupational qualification: same coding as for the father
tabulate u88_0096
label variable u88_0096 "occupational classification for mother"
generate mother_occ = u88_0096
label variable mother_occ "occupational classification for mother"

******************************************
* check var: full set of control variables
******************************************

// marital status (raw coding is kept):
// 1 = ledig = single, 2 = verheiratet = married
// 3 = geschieden = divorced, 4 = verwitwet = widowed
tabulate u88_0089
label variable u88_0089 "marital status"
generate marital = u88_0089
label variable marital "marital status"

tab u88_0091
// raw codes:
// 1 = keine = none, 2 = ein kind = one child
// 3 = zwei kinder = two children, 4 = drei kinder = three children
// 5 = mehr als drei kinder = more than three children
label var u88_0091 "number of children"
// shift the raw 1-5 codes down by one so the value equals the child count:
// 0 = 0, 1 = 1, 2 = 2, 3 = 3, 4 = more than 3 (missing stays missing)
// the previous plain copy kept the 1-5 codes and contradicted this coding
// NOTE(review): assumes the raw item holds only codes 1-5 - check the tab above
gen numb_child = u88_0091 - 1
label var numb_child "number of children"

// schooling (raw coding is kept):
// 1 = bis 8. klasse pos = below 8th grade pos  -> less than high school
// 2 = 8. klasse pos = 8th grade pos            -> some high school
// 3 = 10. klasse pos = 10th grade pos          -> standard school
// 4 = 12. klasse eos_abi = 12th grade eos_abi  -> special advanced school
tabulate u88_0092
label variable u88_0092 "education"
generate educ = u88_0092
label variable educ "education"

// professional qualification (raw coding is kept):
// 1 = noch schüler = still in school
// 2 = noch lehrling = still an apprentice
// 3 = noch student = still at university
// 4 = ohne berufsabschluss = without a professional qualification
// 5 = teilfacharbeiter = partially skilled worker (some skill)
// 6 = facharbeiterabschluss = skilled worker qualification
// 7 = meister abschluß = master craftsman certificate
// 8 = fachschule abschluß = technical / professional school graduation
// 9 = hochschul/uni abschl. = university degree
tabulate u88_0093
label variable u88_0093 "professional qualification"
generate prof_quali = u88_0093
label variable prof_quali "professional qualification"

// income: copied unchanged from the raw item
tabulate u88_0102
label variable u88_0102 "income"
generate inc = u88_0102
label variable inc "income"

// employment status (raw coding is kept):
// 1 = ja,vollzeitbeschäft. = yes, full time
// 2 = ja,teilzeitbeschäft. = yes, part time
// 3 = nein, zur zeit nicht = no, not at this time
// 4 = nein,noch ausbildung = no, still in training
tabulate u88_0101
label variable u88_0101 "employment status"
generate employ = u88_0101
label variable employ "employment status"

********************
* generate variables
********************

* gen: instrument (a missing district stays missing)
// not_dresden = 0 if the respondent lives in the Dresden district (code 6),
// 1 for any other district
// missing() also catches extended missing codes (.a-.z); a plain "!= ." test
// only excludes the system missing value and would code those cases as 1
gen not_dresden = (u88_0600 != 6) if !missing(u88_0600)
label var not_dresden "not live in Dresden district"

* gen: treatment (a missing answer stays missing)
// wg_tv = 0 if u88_0079 == 5 (``überhaupt nicht/not at all''),
// 1 for every other non-missing answer
// NOTE(review): any non-missing code other than 5 counts as watching - confirm
// the raw item has no numeric "no answer" code
gen wg_tv = (u88_0079 != 5) if !missing(u88_0079)
label var wg_tv "watch West German TV"

* gen: outcome on a reversed 1-4 scale (to match Table 2)
// 5 - x maps 4 -> 1, 3 -> 2, 2 -> 3, 1 -> 4; any other value, including
// missing, is left missing
foreach item of varlist u88_0037 u88_0039 u88_0040 {
  generate `item'_kh = 5 - `item' if inlist(`item', 1, 2, 3, 4)
}

// readable copies of the reversed items
generate lenin_kh = u88_0037_kh
generate east_ger_kh = u88_0039_kh
generate poli_pow_kh = u88_0040_kh
label variable lenin_kh "convinced of Leninist/Marxist worldview"
label variable east_ger_kh "feel closely attached to East Germany"
label variable poli_pow_kh "political power is exercised in ways consistent with my views"

* gen: binary outcome
// 1 if the raw answer is 1 or 2, 0 if it is 3 or 4; any other value,
// including missing, is left missing
foreach item of varlist u88_0037 u88_0039 u88_0040 {
  generate `item'_b = inlist(`item', 1, 2) if inlist(`item', 1, 2, 3, 4)
}

// readable copies of the binary items
generate lenin_b = u88_0037_b
generate east_ger_b = u88_0039_b
generate poli_pow_b = u88_0040_b
label variable lenin_b "convinced of Leninist/Marxist worldview"
label variable east_ger_b "feel closely attached to East Germany"
label variable poli_pow_b "political power is exercised in ways consistent with my views"

* sum: viewership by iv (replicate table 1)
// one frequency table of the raw viewing item per instrument value (0, then 1)
forvalues z = 0/1 {
  tabulate u88_0079 if not_dresden == `z'
}

* sum: political attitudes (replicate table 2)
// one frequency table per reversed-scale outcome
foreach outcome of varlist lenin_kh east_ger_kh poli_pow_kh {
  tabulate `outcome'
}

* drop missing values
// missing() is true for the system missing value and for the extended codes
// .a-.z; a plain "== ." comparison would let extended missing values through
drop if missing(u88_0037, u88_0039, u88_0040, not_dresden, wg_tv)

***********************************
* gen data: generate data set for R
***********************************

* keep useful variables
keep lenin_b east_ger_b poli_pow_b not_dresden wg_tv age gender father_occ ///
  mother_occ age_bin_5 age_bin_10 lenin_kh east_ger_kh poli_pow_kh
  
* drop observations with missing values on y, t, z, and covariates
// the covariates are plain copies of raw items, so they can carry extended
// missing codes; missing() removes those observations as well
// the age cohort variables are not part of this check
drop if missing(lenin_b, east_ger_b, poli_pow_b, not_dresden, wg_tv, ///
  age, gender, father_occ, mother_occ)

* rename variables
// binary outcomes lose their _b suffix; treatment and instrument get
// generic names (old names in the first list, new names in the second)
rename (lenin_b east_ger_b poli_pow_b wg_tv not_dresden) ///
  (lenin east_ger poli_pow treatment iv)

* save data
// overwrite any earlier version of the cleaned file
save "$data\kh_2009pa_cleaned.dta", replace

* close log file
log close
